{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e7e97dfe-a53f-4b6c-bfe6-96d9cc98e1cd",
   "metadata": {},
   "source": [
    "# Import modules and connect to Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2ad8cb1-1b47-4fd6-8454-7e6e034ff5b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "from elasticsearch import Elasticsearch, helpers\n",
    "import wget, zipfile, pandas as pd, json, openai\n",
    "import streamlit as st\n",
    "from tqdm.notebook import tqdm\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load settings from a local .env file (if present) into the process environment.\n",
    "load_dotenv()\n",
    "\n",
    "elastic_user = os.getenv('ES_USER')\n",
    "elastic_password = os.getenv('ES_PASSWORD')\n",
    "elastic_endpoint = os.getenv(\"ES_ENDPOINT\")\n",
    "\n",
    "# Fail early with a readable message instead of trying to reach \"https://None:9200\".\n",
    "missing_settings = [\n",
    "    name\n",
    "    for name, value in (\n",
    "        (\"ES_USER\", elastic_user),\n",
    "        (\"ES_PASSWORD\", elastic_password),\n",
    "        (\"ES_ENDPOINT\", elastic_endpoint),\n",
    "    )\n",
    "    if not value\n",
    "]\n",
    "if missing_settings:\n",
    "    raise RuntimeError(f\"Missing environment variables: {', '.join(missing_settings)}\")\n",
    "\n",
    "# Credentials are passed via basic_auth rather than embedded in the URL:\n",
    "# a password containing characters such as '@', ':' or '/' would corrupt the URL.\n",
    "url = f\"https://{elastic_endpoint}:9200\"\n",
    "client = Elasticsearch(\n",
    "    url,\n",
    "    basic_auth=(elastic_user, elastic_password),\n",
    "    ca_certs=\"./http_ca.crt\",\n",
    "    verify_certs=True,\n",
    ")\n",
    "\n",
    "# Round-trip to the cluster to confirm the connection and credentials work.\n",
    "print(client.info())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2065018b-1db5-48c1-9f89-3464b98f807e",
   "metadata": {},
   "source": [
    "# Configure OpenAI connection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05acda13-7086-4476-bb0e-ee868b52ce8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "\n",
    "azure_api_key = os.getenv('AZURE_API_KEY')\n",
    "# Prefer the correctly spelled variable; fall back to the legacy misspelling\n",
    "# ('AZURE_EDNPOINT') so existing .env files keep working.\n",
    "azure_endpoint = os.getenv('AZURE_ENDPOINT') or os.getenv('AZURE_EDNPOINT')\n",
    "azure_api_version = os.getenv('AZURE_API_VERSION')\n",
    "azure_deployment_id = os.getenv('AZURE_DEPLOYMENT_ID')\n",
    "\n",
    "# Module-level configuration of the openai package for Azure.\n",
    "openai.api_type = \"azure\"\n",
    "# openai>=1.0 (required by the `OpenAI` import above) takes the Azure resource URL\n",
    "# from `azure_endpoint`; the pre-1.0 `api_base` attribute is no longer read.\n",
    "openai.azure_endpoint = azure_endpoint\n",
    "openai.api_version = azure_api_version\n",
    "openai.api_key = azure_api_key"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ce15dc5-75a6-47b5-9b18-52c873d4aec7",
   "metadata": {},
   "source": [
    "# Download the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b08aff11-cfb1-4382-9a5a-ed4f0a476bbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATASET_URL = \"https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip\"\n",
    "DATASET_ZIP = \"vector_database_wikipedia_articles_embedded.zip\"\n",
    "\n",
    "# Fetch the archive only when it is not already on disk: a fresh environment\n",
    "# can run this cell, and re-runs do not download the file again.\n",
    "if not os.path.exists(DATASET_ZIP):\n",
    "    wget.download(DATASET_URL, DATASET_ZIP)\n",
    "\n",
    "# Unpack the archive into ./data, where the CSV is read from afterwards.\n",
    "with zipfile.ZipFile(DATASET_ZIP, \"r\") as zip_ref:\n",
    "    zip_ref.extractall(\"data\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49424e92-b5b8-4919-861c-6fa597c6e396",
   "metadata": {},
   "source": [
    "# Read CSV file into a Pandas DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7cb7b01b-f39a-4887-812b-06d8037c67ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CSV extracted into ./data as a DataFrame.\n",
    "wikipedia_dataframe = pd.read_csv(\n",
    "    \"data/vector_database_wikipedia_articles_embedded.csv\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ed584af-60d0-4d64-8fb6-e1476dbb48b1",
   "metadata": {},
   "source": [
    "# Create index with mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8dce7e96-150e-4435-a5ee-93f96b9c7a08",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both vector fields are 1536-dimensional dense vectors, indexed for kNN search\n",
    "# with cosine similarity; the remaining fields hold the article metadata.\n",
    "index_mapping = {\n",
    "    \"properties\": {\n",
    "        \"title_vector\": {\n",
    "            \"type\": \"dense_vector\",\n",
    "            \"dims\": 1536,\n",
    "            \"index\": True,\n",
    "            \"similarity\": \"cosine\"\n",
    "        },\n",
    "        \"content_vector\": {\n",
    "            \"type\": \"dense_vector\",\n",
    "            \"dims\": 1536,\n",
    "            \"index\": True,\n",
    "            \"similarity\": \"cosine\"\n",
    "        },\n",
    "        \"text\": {\"type\": \"text\"},\n",
    "        \"title\": {\"type\": \"text\"},\n",
    "        \"url\": {\"type\": \"keyword\"},\n",
    "        \"vector_id\": {\"type\": \"long\"}\n",
    "    }\n",
    "}\n",
    "\n",
    "# Creating an index that already exists raises an error, so create it only\n",
    "# when it is missing; this keeps the cell safe to re-run.\n",
    "if not client.indices.exists(index=\"wikipedia_vector_index\"):\n",
    "    print(client.indices.create(index=\"wikipedia_vector_index\", mappings=index_mapping))\n",
    "else:\n",
    "    print(\"Index 'wikipedia_vector_index' already exists; skipping creation.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4717a495-253b-4fae-abfd-c5574cbf10f2",
   "metadata": {},
   "source": [
    "# Index data into Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4e46f1c0-b10e-4c5d-bc0e-db3aa47be2af",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dataframe_to_bulk_actions(df):\n",
    "    \"\"\"Yield one bulk-index action per row of ``df``.\n",
    "\n",
    "    Every action targets ``wikipedia_vector_index`` and uses the row's ``id``\n",
    "    column as the document ``_id``. The ``title_vector`` and ``content_vector``\n",
    "    columns are decoded with ``json.loads`` before being placed in ``_source``.\n",
    "    \"\"\"\n",
    "    for _, record in df.iterrows():\n",
    "        document_id = record['id']\n",
    "        document = {\n",
    "            'url': record[\"url\"],\n",
    "            'title': record[\"title\"],\n",
    "            'text': record[\"text\"],\n",
    "            'title_vector': json.loads(record[\"title_vector\"]),\n",
    "            'content_vector': json.loads(record[\"content_vector\"]),\n",
    "            'vector_id': record[\"vector_id\"],\n",
    "        }\n",
    "        yield {\n",
    "            \"_index\": 'wikipedia_vector_index',\n",
    "            \"_id\": document_id,\n",
    "            \"_source\": document,\n",
    "        }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0e112f3c-14bc-4ada-9a45-c9c0fdd6c109",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "87d88b0b76cd4b1584eb9c045cc55e09",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/25000 [00:00<?, ?documents/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Stream the DataFrame rows to Elasticsearch in chunks of 100, showing progress.\n",
    "total_documents = len(wikipedia_dataframe)\n",
    "success_count = 0\n",
    "progress_bar = tqdm(total=total_documents, unit=\"documents\")\n",
    "\n",
    "bulk_results = helpers.streaming_bulk(\n",
    "    client,\n",
    "    actions=dataframe_to_bulk_actions(wikipedia_dataframe),\n",
    "    raise_on_error=False,\n",
    "    chunk_size=100,\n",
    ")\n",
    "\n",
    "for succeeded, item in bulk_results:\n",
    "    if not succeeded:\n",
    "        # raise_on_error=False: failures are reported per document instead of raising.\n",
    "        print(f\"Unable to index {item['index']['_id']}: {item['index']['error']}\")\n",
    "    else:\n",
    "        success_count += 1\n",
    "    progress_bar.update(1)\n",
    "    progress_bar.set_postfix(success=success_count)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c012595-727c-48f2-a676-6bff6e9ebe61",
   "metadata": {},
   "source": [
    "# Build application with Streamlit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "489ab74e-5842-4329-b401-bc4d419b5b8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "!npm install localtunnel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0da4a991-127b-45d4-b16a-cfae5fb07f10",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile app.py\n",
    "\n",
    "import os\n",
    "import streamlit as st\n",
    "from openai import AzureOpenAI\n",
    "from elasticsearch import Elasticsearch\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load settings from a local .env file (if present) into the process environment.\n",
    "load_dotenv()\n",
    "\n",
    "# --- Azure OpenAI client used for chat completions ---\n",
    "azure_api_key = os.getenv('AZURE_API_KEY')\n",
    "# Prefer the correctly spelled variable; fall back to the legacy misspelling\n",
    "# ('AZURE_EDNPOINT') so existing .env files keep working.\n",
    "azure_endpoint = os.getenv('AZURE_ENDPOINT') or os.getenv('AZURE_EDNPOINT')\n",
    "azure_api_version = os.getenv('AZURE_API_VERSION')\n",
    "azure_deployment_id = os.getenv('AZURE_DEPLOYMENT_ID')\n",
    "\n",
    "chat = AzureOpenAI(\n",
    "    api_key=azure_api_key,\n",
    "    api_version=azure_api_version,\n",
    "    azure_endpoint=azure_endpoint,\n",
    ")\n",
    "\n",
    "# --- Azure OpenAI client used for embeddings (configured with its own endpoint and key) ---\n",
    "model_name = os.getenv('MODEL_NAME')\n",
    "azure_embedding_endpoint = os.getenv('AZURE_EMBEDDING_ENDPOINT')\n",
    "azure_embedding_api_key = os.getenv('AZURE_EMBEDDING_API_KEY')\n",
    "azure_embedding_api_version = os.getenv(\"AZURE_EMBEDDING_API_VERSION\")\n",
    "\n",
    "embeddings = AzureOpenAI(\n",
    "    api_key=azure_embedding_api_key,\n",
    "    api_version=azure_embedding_api_version,\n",
    "    azure_endpoint=azure_embedding_endpoint,\n",
    ")\n",
    "\n",
    "# --- Elasticsearch client ---\n",
    "elastic_user = os.getenv('ES_USER')\n",
    "elastic_password = os.getenv('ES_PASSWORD')\n",
    "elastic_endpoint = os.getenv(\"ES_ENDPOINT\")\n",
    "\n",
    "# Credentials are passed via basic_auth rather than embedded in the URL:\n",
    "# a password containing characters such as '@', ':' or '/' would corrupt the URL.\n",
    "url = f\"https://{elastic_endpoint}:9200\"\n",
    "client = Elasticsearch(\n",
    "    url,\n",
    "    basic_auth=(elastic_user, elastic_password),\n",
    "    ca_certs=\"./http_ca.crt\",\n",
    "    verify_certs=True,\n",
    ")\n",
    "\n",
    "# Model name passed to the embeddings API when embedding the user's question.\n",
    "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n",
    "\n",
    "\n",
    "def openai_summarize(query, response):\n",
    "    \"\"\"Answer ``query`` using the text of the top search hit as context.\n",
    "\n",
    "    Args:\n",
    "        query: The user's question.\n",
    "        response: Elasticsearch search response; only the first entry of\n",
    "            ``response['hits']['hits']`` is used.\n",
    "\n",
    "    Returns:\n",
    "        The chat model's answer, or a short notice when the search\n",
    "        returned no hits (so there is no context to answer from).\n",
    "    \"\"\"\n",
    "    hits = response['hits']['hits']\n",
    "    if not hits:\n",
    "        # Without this guard, hits[0] raises IndexError on an empty result set.\n",
    "        return \"No matching documents were found to answer this question.\"\n",
    "\n",
    "    context = hits[0]['_source']['text']\n",
    "    # Spaces around the interpolated question keep it from running into the\n",
    "    # surrounding instruction text.\n",
    "    prompt = f\"Answer the following question: {query} by using the following text: {context}\"\n",
    "    summary = chat.chat.completions.create(\n",
    "        model=azure_deployment_id,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "            {\"role\": \"user\", \"content\": prompt},\n",
    "        ],\n",
    "    )\n",
    "    return summary.choices[0].message.content\n",
    "\n",
    "\n",
    "def search_es(query):\n",
    "    \"\"\"Embed ``query`` and run a kNN search on ``content_vector``.\n",
    "\n",
    "    Returns the raw Elasticsearch response for the 10 nearest documents\n",
    "    (out of 100 candidates) in ``wikipedia_vector_index``.\n",
    "    \"\"\"\n",
    "    # Create embedding\n",
    "    question_embedding = embeddings.embeddings.create(input=query, model=EMBEDDING_MODEL)\n",
    "\n",
    "    # Define Elasticsearch query\n",
    "    response = client.search(\n",
    "        index=\"wikipedia_vector_index\",\n",
    "        knn={\n",
    "            \"field\": \"content_vector\",\n",
    "            \"query_vector\": question_embedding.data[0].embedding,\n",
    "            \"k\": 10,\n",
    "            \"num_candidates\": 100,\n",
    "        },\n",
    "    )\n",
    "    return response\n",
    "\n",
    "\n",
    "def main():\n",
    "    \"\"\"Render the Streamlit page: question input, generated answer, and matching articles.\"\"\"\n",
    "    st.title(\"Gen AI Application\")\n",
    "\n",
    "    # Input for user search query\n",
    "    user_query = st.text_input(\"Enter your question:\", \"what is football?\")\n",
    "\n",
    "    if st.button(\"Search\") and user_query:\n",
    "        st.write(f\"Searching for: {user_query}\")\n",
    "        result = search_es(user_query)\n",
    "        hits = result['hits']['hits']\n",
    "\n",
    "        # Check for results before summarizing: the summary needs the top hit as context.\n",
    "        if not hits:\n",
    "            st.write(\"No results found.\")\n",
    "            return\n",
    "\n",
    "        openai_summary = openai_summarize(user_query, result)\n",
    "        st.write(f\"OpenAI Summary: {openai_summary}\")\n",
    "\n",
    "        # Display search results\n",
    "        st.write(\"Search Results:\")\n",
    "        for hit in hits:\n",
    "            st.write(hit['_source']['title'])\n",
    "            st.write(hit['_source']['text'])\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df034dc0-dbce-449e-a9eb-2eedcafeb7b7",
   "metadata": {},
   "source": [
    "# Run the application"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc4c16f9-0d5b-447c-a6fa-ca7811fafa08",
   "metadata": {},
   "outputs": [],
   "source": [
    "!streamlit run app.py"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_3.11",
   "language": "python",
   "name": "python_3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
